In [1]:
import sys
import os
import time
import numpy as np
import pandas as pd
import umap

# Point NOVA_HOME at the shared NOVA checkout and echo it for the record.
os.environ['NOVA_HOME'] = '/home/projects/hornsteinlab/Collaboration/NOVA'
print('NOVA_HOME is at', os.getenv('NOVA_HOME'))
# Put the checkout on sys.path so the `src.*` imports resolve.
# Guarded: re-running this cell must not stack duplicate sys.path entries.
if os.environ['NOVA_HOME'] not in sys.path:
    sys.path.insert(1, os.environ['NOVA_HOME'])


from src.common.utils import load_config_file
from src.embeddings.embeddings_utils import load_embeddings
from src.figures.distances_plotting import *
from src.analysis.analyzer_distances_utils import summarize_times, merge_batches_by_key, correlate_columns

# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell executes, so changes under src/ are picked up live.
%load_ext autoreload
%autoreload 2
NOVA_HOME is at /home/projects/hornsteinlab/Collaboration/NOVA
NOVA_HOME: /home/projects/hornsteinlab/Collaboration/NOVA
In [2]:
# Directory that holds the distance-statistics CSVs read by the plate sections below.
dist_folder = (
    '/home/projects/hornsteinlab/Collaboration/NOVA'
    '/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen'
    '/figures/AlyssaCoyne_new/distances'
)

Plate 1¶

In [21]:
# Plate 1: read the detailed, multiplexed euclidean distance statistics.
df2 = pd.read_csv(
    f'{dist_folder}'
    '/batch1_all_reps_Ctrl-EDi022_C9-CS2YNL_SALSPositive-CS2FN3_SALSNegative-CS0ANK_Untreated_without_CD41'
    '/distances_stats_euclidean_detailed_multiplexed.csv'
)
# Shorten both label columns to the text before their first underscore.
for col in ('label1', 'label2'):
    df2[col] = df2[col].str.split('_', n=1).str.get(0)
In [109]:
# Preview the first rows to confirm the label columns were shortened as intended.
df2.head()
Out[109]:
label1 label2 block_size total_pairs dist_time_s stats_time_s p5 p10 p25 p50 p75 p90 p95 lower_whisker upper_whisker
0 C9-CS2YNL C9-CS2YNL 108 5778 0.274718 0.272361 0.584648 0.596096 0.616032 0.638600 0.661677 0.681940 0.694304 0.547564 0.730144
1 C9-CS2YNL Ctrl-EDi022 108 9936 0.000804 0.000455 0.928019 0.937688 0.954176 0.971275 0.988407 1.003337 1.012553 0.902828 1.039755
2 C9-CS2YNL SALSNegative-CS0ANK 108 7884 0.000194 0.000362 0.959189 0.967080 0.980337 0.995381 1.009868 1.022334 1.029533 0.936040 1.054165
3 C9-CS2YNL SALSPositive-CS2FN3 108 11664 0.000192 0.000358 0.627389 0.639633 0.658004 0.680004 0.702254 0.722955 0.734710 0.591629 0.768629
4 Ctrl-EDi022 Ctrl-EDi022 92 4186 0.000871 0.000355 0.659427 0.670943 0.690337 0.713140 0.734862 0.753387 0.765508 0.623548 0.801651
In [110]:
# Plate 1 boxplots from the project helper (presumably drawn from the
# percentile/whisker columns -- confirm in src.figures.distances_plotting).
# NOTE: df2 is rebound in every section; re-run the plate 1 loading cell first.
plot_custom_boxplot(df2)
In [111]:
# Plate 1 distance histogram via the project helper (which columns it bins is
# not visible here -- confirm in src.figures.distances_plotting).
plot_dist_histogram(df2)
In [ ]:
# Plate 1 heatmap of the distance table.
plot_distances_heatmap(df2, figsize=(6, 6))
In [113]:
# Plate 1 proximity network. The threshold handed to the helper is the 90th
# percentile of the p50 column (how the helper applies it: confirm in its source).
plot_cluster_proximity_network(
    df2,
    threshold=df2['p50'].quantile(0.9),
    figsize=(7, 7),
)

Plate 2¶

In [114]:
# Plate 2: read the detailed, multiplexed euclidean distance statistics.
# This rebinds df2, replacing the plate 1 table.
df2 = pd.read_csv(
    f'{dist_folder}'
    '/batch1_all_reps_Ctrl-EDi029_C9-CS7VCZ_SALSPositive-CS4ZCD_SALSNegative-CS0JPP_Untreated_without_CD41'
    '/distances_stats_euclidean_detailed_multiplexed.csv'
)
# Shorten both label columns to the text before their first underscore.
for col in ('label1', 'label2'):
    df2[col] = df2[col].str.split('_', n=1).str.get(0)
In [115]:
# Preview the first rows of the plate 2 table.
df2.head()
Out[115]:
label1 label2 block_size total_pairs dist_time_s stats_time_s p5 p10 p25 p50 p75 p90 p95 lower_whisker upper_whisker
0 C9-CS7VCZ C9-CS7VCZ 90 4005 0.083844 0.075255 0.612145 0.623272 0.640595 0.662339 0.683128 0.701199 0.713079 0.576797 0.746926
1 C9-CS7VCZ Ctrl-EDi029 90 6930 0.000340 0.011973 0.911534 0.920676 0.934539 0.951351 0.967667 0.981602 0.990094 0.884848 1.017359
2 C9-CS7VCZ SALSNegative-CS0JPP 90 4050 0.024905 0.000328 0.935535 0.945357 0.960833 0.978025 0.995677 1.010444 1.020209 0.908568 1.047942
3 C9-CS7VCZ SALSPositive-CS4ZCD 90 5760 0.000262 0.000263 0.861233 0.870707 0.885855 0.902577 0.919815 0.935384 0.944683 0.834914 0.970756
4 Ctrl-EDi029 Ctrl-EDi029 77 2926 0.000495 0.000268 0.642322 0.653040 0.672254 0.693391 0.715271 0.735617 0.747748 0.607728 0.779796
In [116]:
# Plate 2 boxplots from the project helper; df2 must currently hold the plate 2 table.
plot_custom_boxplot(df2)
In [117]:
# Plate 2 distance histogram via the project helper.
plot_dist_histogram(df2)
In [ ]:
# Plate 2 heatmap of the distance table.
plot_distances_heatmap(df2, figsize=(6, 6))
In [119]:
# Plate 2 proximity network; threshold is the 90th percentile of the p50 column.
plot_cluster_proximity_network(
    df2,
    threshold=df2['p50'].quantile(0.9),
    figsize=(7, 7),
)

Plate 3¶

In [120]:
# Plate 3: read the detailed, multiplexed euclidean distance statistics.
# This rebinds df2, replacing the plate 2 table.
df2 = pd.read_csv(
    f'{dist_folder}'
    '/batch1_all_reps_Ctrl-EDi037_C9-CS8RFT_SALSPositive-CS7TN6_SALSNegative-CS6ZU8_Untreated_without_CD41'
    '/distances_stats_euclidean_detailed_multiplexed.csv'
)
# Shorten both label columns to the text before their first underscore.
for col in ('label1', 'label2'):
    df2[col] = df2[col].str.split('_', n=1).str.get(0)
In [121]:
# Preview the first rows of the plate 3 table.
df2.head()
Out[121]:
label1 label2 block_size total_pairs dist_time_s stats_time_s p5 p10 p25 p50 p75 p90 p95 lower_whisker upper_whisker
0 C9-CS8RFT C9-CS8RFT 102 5151 0.235653 0.163136 0.565796 0.576248 0.594662 0.613787 0.634659 0.652604 0.664471 0.534668 0.694653
1 C9-CS8RFT Ctrl-EDi037 102 5508 0.000990 0.000451 0.954982 0.961930 0.974423 0.988401 1.002261 1.014767 1.022264 0.932666 1.044018
2 C9-CS8RFT SALSNegative-CS6ZU8 102 6732 0.000194 0.000350 0.918369 0.923695 0.934535 0.946592 0.959193 0.969438 0.975672 0.897548 0.996181
3 C9-CS8RFT SALSPositive-CS7TN6 102 13158 0.000192 0.000362 0.891273 0.899982 0.913935 0.928961 0.944050 0.957357 0.965517 0.868762 0.989223
4 Ctrl-EDi037 Ctrl-EDi037 54 1431 0.000570 0.032743 0.626692 0.638666 0.657642 0.677068 0.697054 0.714038 0.728015 0.598523 0.756173
In [122]:
# Plate 3 boxplots from the project helper; df2 must currently hold the plate 3 table.
plot_custom_boxplot(df2)
In [123]:
# Plate 3 distance histogram via the project helper.
plot_dist_histogram(df2)
In [ ]:
# Plate 3 heatmap of the distance table.
plot_distances_heatmap(df2, figsize=(6, 6))
In [125]:
# Plate 3 proximity network; threshold is the 90th percentile of the p50 column.
plot_cluster_proximity_network(
    df2,
    threshold=df2['p50'].quantile(0.9),
    figsize=(7, 7),
)

All Plates¶

In [126]:
# All plates together: read the detailed, multiplexed euclidean distance statistics.
# This rebinds df2, replacing the plate 3 table.
df2 = pd.read_csv(
    f'{dist_folder}'
    '/batch1_all_reps_all_cell_lines_Untreated_without_CD41'
    '/distances_stats_euclidean_detailed_multiplexed.csv'
)
# Shorten both label columns to the text before their first underscore.
for col in ('label1', 'label2'):
    df2[col] = df2[col].str.split('_', n=1).str.get(0)
In [127]:
# Preview the first rows of the combined (all cell lines) table.
df2.head()
Out[127]:
label1 label2 block_size total_pairs dist_time_s stats_time_s p5 p10 p25 p50 p75 p90 p95 lower_whisker upper_whisker
0 C9-CS2YNL C9-CS2YNL 108 5778 0.291670 0.179715 0.584649 0.596096 0.616032 0.638600 0.661677 0.681940 0.694304 0.547564 0.730144
1 C9-CS2YNL C9-CS7VCZ 108 9720 0.000876 0.000325 0.679532 0.689829 0.707340 0.727445 0.748034 0.766935 0.777597 0.646299 0.809076
2 C9-CS2YNL C9-CS8RFT 108 11016 0.000159 0.000233 0.683546 0.694357 0.711962 0.732560 0.753342 0.772777 0.783976 0.649891 0.815413
3 C9-CS2YNL Ctrl-EDi022 108 9936 0.000155 0.000231 0.928019 0.937688 0.954176 0.971275 0.988407 1.003337 1.012553 0.902828 1.039755
4 C9-CS2YNL Ctrl-EDi029 108 8316 0.000155 0.000232 0.909294 0.917882 0.931749 0.947518 0.963163 0.976837 0.984205 0.884627 1.010285
In [128]:
# Boxplots across all cell lines; df2 must currently hold the combined table.
plot_custom_boxplot(df2)
In [129]:
# Distance histogram across all cell lines via the project helper.
plot_dist_histogram(df2)
In [ ]:
# Heatmap across all cell lines, using the helper's own default arguments.
plot_distances_heatmap(df2)
In [131]:
# Proximity network across all cell lines; threshold is the 90th percentile of
# the p50 column. No figsize is passed, so the helper's default applies.
plot_cluster_proximity_network(
    df2,
    threshold=df2['p50'].quantile(0.9),
)
In [ ]:
 

Older AlyssaCoyne dataset¶

In [3]:
# Distance-statistics directory of the earlier AlyssaCoyne run (no "_new" suffix).
dist_folder_prev = (
    '/home/projects/hornsteinlab/Collaboration/NOVA'
    '/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen'
    '/figures/AlyssaCoyne/distances'
)
In [9]:
# Older dataset: read the detailed, multiplexed euclidean distance statistics.
# Labels are left untouched here (no underscore stripping); df2 is rebound again.
df2 = pd.read_csv(
    f'{dist_folder_prev}'
    '/batch1_all_reps_all_cell_lines_all_conditions_all_markers'
    '/distances_stats_euclidean_detailed_multiplexed.csv'
)
In [10]:
# Preview the first rows of the older table; labels here keep their "_repN" suffix.
df2.head()
Out[10]:
label1 label2 block_size total_pairs dist_time_s stats_time_s p5 p10 p25 p50 p75 p90 p95 lower_whisker upper_whisker
0 Controls_rep1 Controls_rep1 129 8256 0.337314 0.089434 0.429234 0.448242 0.485859 0.533345 0.586872 0.646789 0.679897 0.334341 0.738390
1 Controls_rep1 Controls_rep2 129 15996 0.000955 0.000404 0.442218 0.465512 0.508212 0.561499 0.621011 0.676314 0.708968 0.339013 0.790209
2 Controls_rep1 Controls_rep3 129 9288 0.000214 0.000344 0.486758 0.512115 0.558513 0.615220 0.681248 0.747007 0.788203 0.374411 0.865350
3 Controls_rep1 Controls_rep4 129 20382 0.004514 0.000363 0.489501 0.511124 0.549412 0.596442 0.648816 0.698344 0.726855 0.400307 0.797921
4 Controls_rep1 Controls_rep5 129 18060 0.004086 0.000349 0.443409 0.465954 0.505666 0.553365 0.607189 0.662562 0.698276 0.353382 0.759474
In [12]:
# Boxplots for the older dataset; df2 must currently hold the older table.
plot_custom_boxplot(df2)
In [13]:
# Distance histogram for the older dataset via the project helper.
plot_dist_histogram(df2)
In [14]:
# Replicate-level bar plot from the project helper (presumably relies on the
# "_repN" suffix still present in the labels -- confirm in distances_plotting).
plot_replicate_bars_extended(df2)
In [ ]:
# Heatmap for the older dataset; fmt is forwarded to the helper
# (presumably the cell annotation format -- confirm in distances_plotting).
plot_distances_heatmap(df2, fmt=".1f")
In [20]:
# Proximity network for the older dataset; threshold is the 90th percentile of the p50 column.
plot_cluster_proximity_network(
    df2,
    threshold=df2['p50'].quantile(0.9),
)

Distances between lines & reps for each marker¶

In [2]:
# Median-only (p50) euclidean distance table with one label per marker/line/replicate.
dis_path = (
    "/home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models"
    "/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen/figures/AlyssaCoyne/distances"
    "/batch1_all_reps_Controls_sALSPositiveCytoTDP43_sALSNegativeCytoTDP43_c9orf72ALSPatients_all_conditions_without_CD41"
    "/distances_stats_euclidean.csv"
)
df = pd.read_csv(dis_path)
df
Out[2]:
label1 label2 block_size total_pairs dist_time_s stats_time_s p50
0 DAPI_Controls_rep1 DAPI_Controls_rep1 129 8256 0.262671 0.066571 0.446008
1 DAPI_Controls_rep1 DAPI_Controls_rep2 129 15996 0.001063 0.000172 0.489870
2 DAPI_Controls_rep1 DAPI_Controls_rep3 129 9288 0.000291 0.000120 0.530618
3 DAPI_Controls_rep1 DAPI_Controls_rep4 129 20382 0.000251 0.000114 0.530957
4 DAPI_Controls_rep1 DAPI_Controls_rep5 129 18060 0.000238 0.000118 0.476871
... ... ... ... ... ... ... ...
3565 TDP43_sALSPositiveCytoTDP43_rep7 TDP43_sALSPositiveCytoTDP43_rep8 127 12446 0.000190 0.000113 0.665437
3566 TDP43_sALSPositiveCytoTDP43_rep7 TDP43_sALSPositiveCytoTDP43_rep9 127 11938 0.000189 0.000109 0.718668
3567 TDP43_sALSPositiveCytoTDP43_rep8 TDP43_sALSPositiveCytoTDP43_rep8 98 4753 0.000835 0.000122 0.605611
3568 TDP43_sALSPositiveCytoTDP43_rep8 TDP43_sALSPositiveCytoTDP43_rep9 98 9212 0.000183 0.000104 0.677670
3569 TDP43_sALSPositiveCytoTDP43_rep9 TDP43_sALSPositiveCytoTDP43_rep9 94 4371 0.000755 0.000111 0.540008

3570 rows × 7 columns

In [14]:
# Filter marker-to-marker distances
# Tag each row with the marker of both labels (the text before the first
# underscore), then keep only the pairs whose two labels share that marker.
# Note: the two marker columns are added to df itself.
df['marker1'] = df['label1'].str.split('_', n=1).str.get(0)
df['marker2'] = df['label2'].str.split('_', n=1).str.get(0)
df_filtered = df.loc[df['marker1'].eq(df['marker2'])]
df_filtered
Out[14]:
label1 label2 block_size total_pairs dist_time_s stats_time_s p50 marker1 marker2
0 DAPI_Controls_rep1 DAPI_Controls_rep1 129 8256 0.262671 0.066571 0.446008 DAPI DAPI
1 DAPI_Controls_rep1 DAPI_Controls_rep2 129 15996 0.001063 0.000172 0.489870 DAPI DAPI
2 DAPI_Controls_rep1 DAPI_Controls_rep3 129 9288 0.000291 0.000120 0.530618 DAPI DAPI
3 DAPI_Controls_rep1 DAPI_Controls_rep4 129 20382 0.000251 0.000114 0.530957 DAPI DAPI
4 DAPI_Controls_rep1 DAPI_Controls_rep5 129 18060 0.000238 0.000118 0.476871 DAPI DAPI
... ... ... ... ... ... ... ... ... ...
3565 TDP43_sALSPositiveCytoTDP43_rep7 TDP43_sALSPositiveCytoTDP43_rep8 127 12446 0.000190 0.000113 0.665437 TDP43 TDP43
3566 TDP43_sALSPositiveCytoTDP43_rep7 TDP43_sALSPositiveCytoTDP43_rep9 127 11938 0.000189 0.000109 0.718668 TDP43 TDP43
3567 TDP43_sALSPositiveCytoTDP43_rep8 TDP43_sALSPositiveCytoTDP43_rep8 98 4753 0.000835 0.000122 0.605611 TDP43 TDP43
3568 TDP43_sALSPositiveCytoTDP43_rep8 TDP43_sALSPositiveCytoTDP43_rep9 98 9212 0.000183 0.000104 0.677670 TDP43 TDP43
3569 TDP43_sALSPositiveCytoTDP43_rep9 TDP43_sALSPositiveCytoTDP43_rep9 94 4371 0.000755 0.000111 0.540008 TDP43 TDP43

924 rows × 9 columns

In [16]:
# One heatmap per marker, in order of first appearance in df_filtered.
markers = df_filtered['marker1'].unique()
for marker in markers:
    print(f"Plotting distances for marker: {marker}")
    df_marker = df_filtered.loc[df_filtered['marker1'].eq(marker)]
    plot_distances_heatmap(df_marker, fmt=".1f")
Plotting distances for marker: DAPI
Plotting distances for marker: DCP1A
Plotting distances for marker: Map2
Plotting distances for marker: TDP43

Distances between gene groups (no reps)¶

In [17]:
# Median-only (p50) euclidean distance table from the "distances_test" folder,
# with one label per marker/line (no replicate suffix). Rebinds df.
df_path = (
    "/home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models"
    "/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen/figures/AlyssaCoyne/distances_test"
    "/batch1_all_reps_Controls_sALSPositiveCytoTDP43_sALSNegativeCytoTDP43_c9orf72ALSPatients_all_conditions_without_CD41"
    "/distances_stats_euclidean.csv"
)

df = pd.read_csv(df_path)
df
Out[17]:
label1 label2 block_size total_pairs dist_time_s stats_time_s p50
0 DAPI_Controls DAPI_Controls 702 246051 0.080557 0.037627 0.538911
1 DAPI_Controls DAPI_c9orf72ALSPatients 702 248508 0.022959 0.000291 1.059778
2 DAPI_Controls DAPI_sALSNegativeCytoTDP43 702 122850 0.000246 0.003721 0.745601
3 DAPI_Controls DAPI_sALSPositiveCytoTDP43 702 732888 0.000473 0.000357 0.653484
4 DAPI_Controls DCP1A_Controls 702 497016 0.000445 0.000179 0.888739
... ... ... ... ... ... ... ...
131 TDP43_c9orf72ALSPatients TDP43_sALSNegativeCytoTDP43 354 61950 0.000155 0.000241 0.970611
132 TDP43_c9orf72ALSPatients TDP43_sALSPositiveCytoTDP43 354 369576 0.000311 0.000278 0.940574
133 TDP43_sALSNegativeCytoTDP43 TDP43_sALSNegativeCytoTDP43 175 15225 0.001135 0.000096 0.664504
134 TDP43_sALSNegativeCytoTDP43 TDP43_sALSPositiveCytoTDP43 175 182700 0.020715 0.006226 0.882345
135 TDP43_sALSPositiveCytoTDP43 TDP43_sALSPositiveCytoTDP43 1044 544446 0.006292 0.000201 0.698787

136 rows × 7 columns

In [18]:
# Filter marker-to-marker distances
# Derive the marker (text before the first underscore) for both labels and
# retain only same-marker pairs. The marker columns are written onto df in place.
df['marker1'] = df['label1'].str.split('_', n=1).str.get(0)
df['marker2'] = df['label2'].str.split('_', n=1).str.get(0)
df_filtered = df.loc[df['marker1'].eq(df['marker2'])]
df_filtered
Out[18]:
label1 label2 block_size total_pairs dist_time_s stats_time_s p50 marker1 marker2
0 DAPI_Controls DAPI_Controls 702 246051 0.080557 0.037627 0.538911 DAPI DAPI
1 DAPI_Controls DAPI_c9orf72ALSPatients 702 248508 0.022959 0.000291 1.059778 DAPI DAPI
2 DAPI_Controls DAPI_sALSNegativeCytoTDP43 702 122850 0.000246 0.003721 0.745601 DAPI DAPI
3 DAPI_Controls DAPI_sALSPositiveCytoTDP43 702 732888 0.000473 0.000357 0.653484 DAPI DAPI
16 DAPI_c9orf72ALSPatients DAPI_c9orf72ALSPatients 354 62481 0.002157 0.010732 0.557435 DAPI DAPI
17 DAPI_c9orf72ALSPatients DAPI_sALSNegativeCytoTDP43 354 61950 0.000192 0.000237 0.898647 DAPI DAPI
18 DAPI_c9orf72ALSPatients DAPI_sALSPositiveCytoTDP43 354 369576 0.000327 0.023004 0.964783 DAPI DAPI
31 DAPI_sALSNegativeCytoTDP43 DAPI_sALSNegativeCytoTDP43 175 15225 0.002754 0.000105 0.555436 DAPI DAPI
32 DAPI_sALSNegativeCytoTDP43 DAPI_sALSPositiveCytoTDP43 175 182700 0.000336 0.000249 0.712668 DAPI DAPI
45 DAPI_sALSPositiveCytoTDP43 DAPI_sALSPositiveCytoTDP43 1044 544446 0.006981 0.000204 0.668043 DAPI DAPI
58 DCP1A_Controls DCP1A_Controls 708 250278 0.004188 0.014790 0.598835 DCP1A DCP1A
59 DCP1A_Controls DCP1A_c9orf72ALSPatients 708 251340 0.000169 0.000253 1.081521 DCP1A DCP1A
60 DCP1A_Controls DCP1A_sALSNegativeCytoTDP43 708 125316 0.000166 0.003610 0.791463 DCP1A DCP1A
61 DCP1A_Controls DCP1A_sALSPositiveCytoTDP43 708 739152 0.000448 0.000347 0.726276 DCP1A DCP1A
70 DCP1A_c9orf72ALSPatients DCP1A_c9orf72ALSPatients 355 62835 0.002151 0.007859 0.655756 DCP1A DCP1A
71 DCP1A_c9orf72ALSPatients DCP1A_sALSNegativeCytoTDP43 355 62835 0.000168 0.000238 0.876319 DCP1A DCP1A
72 DCP1A_c9orf72ALSPatients DCP1A_sALSPositiveCytoTDP43 355 370620 0.000324 0.017184 1.009766 DCP1A DCP1A
81 DCP1A_sALSNegativeCytoTDP43 DCP1A_sALSNegativeCytoTDP43 177 15576 0.001156 0.000099 0.676418 DCP1A DCP1A
82 DCP1A_sALSNegativeCytoTDP43 DCP1A_sALSPositiveCytoTDP43 177 184788 0.008280 0.000236 0.784279 DCP1A DCP1A
91 DCP1A_sALSPositiveCytoTDP43 DCP1A_sALSPositiveCytoTDP43 1044 544446 0.006173 0.000198 0.747399 DCP1A DCP1A
100 Map2_Controls Map2_Controls 708 250278 0.004132 0.000393 0.579124 Map2 Map2
101 Map2_Controls Map2_c9orf72ALSPatients 708 251340 0.000162 0.002713 1.010065 Map2 Map2
102 Map2_Controls Map2_sALSNegativeCytoTDP43 708 125316 0.000157 0.000261 0.745220 Map2 Map2
103 Map2_Controls Map2_sALSPositiveCytoTDP43 708 739152 0.000294 0.007940 0.687323 Map2 Map2
108 Map2_c9orf72ALSPatients Map2_c9orf72ALSPatients 355 62835 0.002167 0.017843 0.546210 Map2 Map2
109 Map2_c9orf72ALSPatients Map2_sALSNegativeCytoTDP43 355 62835 0.000159 0.000237 0.798772 Map2 Map2
110 Map2_c9orf72ALSPatients Map2_sALSPositiveCytoTDP43 355 370620 0.000300 0.000286 0.905957 Map2 Map2
115 Map2_sALSNegativeCytoTDP43 Map2_sALSNegativeCytoTDP43 177 15576 0.001158 0.000101 0.638141 Map2 Map2
116 Map2_sALSNegativeCytoTDP43 Map2_sALSPositiveCytoTDP43 177 184788 0.000309 0.000236 0.708084 Map2 Map2
121 Map2_sALSPositiveCytoTDP43 Map2_sALSPositiveCytoTDP43 1044 544446 0.011010 0.000196 0.696022 Map2 Map2
126 TDP43_Controls TDP43_Controls 702 246051 0.004106 0.003410 0.579092 TDP43 TDP43
127 TDP43_Controls TDP43_c9orf72ALSPatients 702 248508 0.000156 0.006422 1.029109 TDP43 TDP43
128 TDP43_Controls TDP43_sALSNegativeCytoTDP43 702 122850 0.000156 0.000258 0.778982 TDP43 TDP43
129 TDP43_Controls TDP43_sALSPositiveCytoTDP43 702 732888 0.000298 0.000347 0.740959 TDP43 TDP43
130 TDP43_c9orf72ALSPatients TDP43_c9orf72ALSPatients 354 62481 0.002132 0.003955 0.680392 TDP43 TDP43
131 TDP43_c9orf72ALSPatients TDP43_sALSNegativeCytoTDP43 354 61950 0.000155 0.000241 0.970611 TDP43 TDP43
132 TDP43_c9orf72ALSPatients TDP43_sALSPositiveCytoTDP43 354 369576 0.000311 0.000278 0.940574 TDP43 TDP43
133 TDP43_sALSNegativeCytoTDP43 TDP43_sALSNegativeCytoTDP43 175 15225 0.001135 0.000096 0.664504 TDP43 TDP43
134 TDP43_sALSNegativeCytoTDP43 TDP43_sALSPositiveCytoTDP43 175 182700 0.020715 0.006226 0.882345 TDP43 TDP43
135 TDP43_sALSPositiveCytoTDP43 TDP43_sALSPositiveCytoTDP43 1044 544446 0.006292 0.000201 0.698787 TDP43 TDP43
In [19]:
# Draw a separate group-level heatmap for every marker found in df_filtered.
markers = df_filtered['marker1'].unique()
for marker in markers:
    print(f"Plotting distances for marker: {marker}")
    df_marker = df_filtered.loc[df_filtered['marker1'].eq(marker)]
    plot_distances_heatmap(df_marker, fmt=".1f")
Plotting distances for marker: DAPI
Plotting distances for marker: DCP1A
Plotting distances for marker: Map2
Plotting distances for marker: TDP43